from __future__ import print_function, division
import numpy as np
import pymc
from matplotlib import pyplot as plt
%matplotlib inline
First, we load the demo pRESTO sequence read data and keep only the quality-score strings (every fourth line of each FASTQ file).
We also define a function that maps a quality string onto Phred scores by subtracting the ASCII offset of 33 (Phred+33 encoding). For example: "$%$" -> [3,4,3]
# Quality scores are the 4th line of each FASTQ record (0-indexed lines 3, 7, ...)
with open("MiSeqDemo_R1.fastq") as R1, open("MiSeqDemo_R2.fastq") as R2:
    quals_R1 = [line.rstrip() for n, line in enumerate(R1) if n % 4 == 3]
    quals_R2 = [line.rstrip() for n, line in enumerate(R2) if n % 4 == 3]
# The 94 printable characters of Phred+33 encoding: '!' (Q=0) through '~' (Q=93)
_qualstr = list(r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''')
assert _qualstr == [chr(n) for n in range(33, 33 + 94)]
def qmap(c): return ord(c) - 33                   # quality character -> Phred score
def qmaps(qstr): return [qmap(c) for c in qstr]   # quality string -> list of Phred scores
assert qmap("#") == 2
print("Loaded two paired-end read files, with", len(quals_R1) + len(quals_R2), "reads total")
Here I plot some of the data to see what it looks like.
def plot_quals(quals_R):
    qs = np.array([qmaps(q) for q in quals_R])
    qmean, qstd = qs.mean(axis=0), qs.std(axis=0)
    # Per-position fraction of reads above each quality threshold
    qprob_30 = np.sum(qs > 30, axis=0) / qs.shape[0]
    qprob_2 = np.sum(qs > 2, axis=0) / qs.shape[0]
    plt.figure(figsize=(16, 4))
    plt.ylim([0, 50.25])
    plt.title("Mean/Std Q (blue), Prob Q>30 (purple), Prob Q>2 (red)")
    _ = plt.errorbar(range(len(qmean)), qmean, yerr=qstd)
    _ = plt.plot(range(len(qmean)), qprob_30 * 50)  # probabilities scaled onto the Q axis
    _ = plt.plot(range(len(qmean)), qprob_2 * 50)
plot_quals(quals_R1)
#plot_quals(quals_R2)
This is another way to look at the data. First we transform the Phred scores into the expected probability of error, P_err = 10^(-Q/10). Then we plot the error probability averaged over all reads (plus one example read for comparison).
We can see that the error rate transitions from good (< 0.01) to bad (> 0.1) at around position 100. By the end of the read, at position 250, the error rate is > 0.3.
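To make those thresholds concrete, here is the same transform applied to the two quality cutoffs plotted earlier (a quick worked check, using only the Phred definition):
# Phred Q -> expected error probability: P_err = 10 ** (-Q / 10)
for q in (2, 30):
    print("Q =", q, "-> P_err =", 10 ** (-q / 10))
# Q = 2  -> P_err ~ 0.63  (nearly random base calls)
# Q = 30 -> P_err = 0.001 (one error per thousand bases)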
num_nts = len(quals_R1[0])
num_reads = len(quals_R1)
print("Read length:", num_nts, "\nNumber of reads:", num_reads)
assert all(len(q) == num_nts for q in quals_R1)
# Phred score -> expected error probability: P_err = 10 ** (-Q / 10)
errors = 10 ** (np.array([qmaps(q) for q in quals_R1]) / -10)
plt.figure(figsize=(16, 4))
plt.title("Average estd. errors (blue), and one example read (purple)")
_ = plt.plot(errors.mean(axis=0))
_ = plt.plot(errors[1])
The general idea of the model is that there is a phase transition from good-quality data to bad. Within each phase the error frequency increases linearly, but with different parameters, and the probability of error at each position is treated as Bernoulli-distributed. I started from the coal-mining disaster example model: https://github.com/pymc-devs/pymc/blob/master/pymc/examples/disaster_model.py
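The mean function is just a piecewise-linear curve with an unknown breakpoint. As a minimal NumPy sketch of what the model encodes (the function and argument names here are illustrative, not part of the model code below):
def piecewise_rate(idx, s, b_early, m_early, b_late, m_late):
    # One line up to the switchpoint s, another (restarted at s) after it
    return np.where(idx <= s,
                    b_early + m_early * idx,
                    b_late + m_late * (idx - s))

# e.g. a curve whose slope changes at position 100:
curve = piecewise_rate(np.arange(250), 100, 0.005, 1e-4, 0.05, 2e-3)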
I had a few problems encoding the model; in particular, the observed values here are continuous error probabilities rather than integer counts, so the Bernoulli likelihood doesn't apply directly and the Poisson used below is only a rough stand-in:
import theano

with pymc.Model() as model:
    switchpoint = pymc.DiscreteUniform('switchpoint', lower=0, upper=num_nts)
    early_intercept = pymc.Uniform('early_intercept', lower=0, upper=1)
    #early_intercept = pymc.Normal('early_intercept', mu=0, tau=1)
    early_slope = pymc.Normal('early_slope', mu=0, tau=.1)
    late_intercept = pymc.Uniform('late_intercept', lower=0, upper=1)
    #late_intercept = pymc.Normal('late_intercept', mu=0, tau=1)
    late_slope = pymc.Normal('late_slope', mu=0, tau=.1)
    # Piecewise-linear rate: one line before the switchpoint, another after it
    idx = np.arange(num_nts)
    rate = pymc.switch(switchpoint >= idx,
                       early_intercept + early_slope * idx,
                       late_intercept + late_slope * (idx - switchpoint))
    #errord = pymc.Bernoulli('errord', rate, observed=errors)
    errord = pymc.Poisson('errord', rate, observed=errors)
The results are plotted below.
This model has some issues, as described above, and is really only suitable as a toy example. Still, it would be interesting to apply the same model to different sequencing runs, to see whether the switchpoint and slopes are consistent.
with model:
    # Initial values for the stochastic nodes
    start = {'early_intercept': .1, 'early_slope': .1, 'late_intercept': .2, 'late_slope': .1}
    #start = pymc.find_MAP()
    # Slice sampler for the continuous parameters, Metropolis for the discrete switchpoint
    step1 = pymc.Slice([early_intercept, early_slope, late_intercept, late_slope])
    step2 = pymc.Metropolis([switchpoint])
    tr = pymc.sample(500, start=start, step=[step1, step2])
pymc.traceplot(tr[:150])
plt.figure(figsize=(16, 6))
_ = plt.plot(errors.mean(axis=0))
# Overlay piecewise-linear fits drawn from the second half of the trace (post burn-in)
for i in range(len(tr) // 2, len(tr)):
    point = tr.point(i)
    plt.plot(np.arange(point['switchpoint']),
             point['early_intercept'] + point['early_slope'] * np.arange(point['switchpoint']),
             color='green', alpha=10. / len(tr))
    plt.plot(np.arange(point['switchpoint'], num_nts),
             point['late_intercept'] + point['late_slope'] * (np.arange(point['switchpoint'], num_nts) - point['switchpoint']),
             color='red', alpha=10. / len(tr))
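A quick numeric summary of the sampled switchpoint complements the overlay plot. A sketch, assuming the trace supports name-based indexing as in PyMC3's MultiTrace:
# Discard the first half of the trace as burn-in, then summarize
burn = len(tr) // 2
sw = tr['switchpoint'][burn:]
print("switchpoint posterior median:", np.median(sw))
print("switchpoint 90% interval:", np.percentile(sw, [5, 95]))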
This is a working, self-contained test to make sure PyMC3 is doing what I expect with a very simple model.
num_nts = 1000
# Synthetic piecewise-linear data: slope 1 up to position 500, then intercept 200 with slope 3
errors = np.array([i for i in range(500)] + [200 + i * 3 for i in range(500)])
plt.figure(figsize=(16, 4))
_ = plt.plot(errors)
with pymc.Model() as model:
    switchpoint = pymc.DiscreteUniform('switchpoint', lower=0, upper=num_nts)
    early_intercept = pymc.Exponential('early_intercept', lam=1.)
    early_slope = pymc.Exponential('early_slope', lam=1.)
    late_intercept = pymc.Exponential('late_intercept', lam=1.)
    late_slope = pymc.Exponential('late_slope', lam=1.)
    # Allocate the appropriate Poisson rate to positions before and after the
    # current switchpoint location
    idx = np.arange(num_nts)
    rate = pymc.switch(switchpoint >= idx,
                       early_intercept + early_slope * idx,
                       late_intercept + late_slope * (idx - switchpoint))
    errord = pymc.Poisson('errord', rate, observed=errors)
    # Initial values for the stochastic nodes
    start = {'early_intercept': 1, 'early_slope': .1, 'late_intercept': 30., 'late_slope': 1}
    # Slice sampler for the continuous parameters, Metropolis for the discrete switchpoint
    step1 = pymc.Slice([early_intercept, early_slope, late_intercept, late_slope])
    step2 = pymc.Metropolis([switchpoint])
    tr = pymc.sample(1000, start=start, step=[step1, step2])
pymc.traceplot(tr)
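Since the toy data were generated with a known breakpoint at position 500 (early slope 1, late intercept 200, late slope 3), the posterior can be checked against those values. A sketch, under the same name-indexing assumption as before:
# Compare posterior means (after burn-in) to the generating parameters
burn = len(tr) // 2
for name, truth in [('switchpoint', 500), ('early_slope', 1.),
                    ('late_intercept', 200.), ('late_slope', 3.)]:
    print(name, "posterior mean:", tr[name][burn:].mean(), "| true:", truth)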